In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import plotly.graph_objects as go
In [2]:
# Load the Iris dataset
iris = load_iris()

iris
Out[2]:
{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
        [5.5, 4.2, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.2],
        [5. , 3.2, 1.2, 0.2],
        [5.5, 3.5, 1.3, 0.2],
        [4.9, 3.6, 1.4, 0.1],
        [4.4, 3. , 1.3, 0.2],
        [5.1, 3.4, 1.5, 0.2],
        [5. , 3.5, 1.3, 0.3],
        [4.5, 2.3, 1.3, 0.3],
        [4.4, 3.2, 1.3, 0.2],
        [5. , 3.5, 1.6, 0.6],
        [5.1, 3.8, 1.9, 0.4],
        [4.8, 3. , 1.4, 0.3],
        [5.1, 3.8, 1.6, 0.2],
        [4.6, 3.2, 1.4, 0.2],
        [5.3, 3.7, 1.5, 0.2],
        [5. , 3.3, 1.4, 0.2],
        [7. , 3.2, 4.7, 1.4],
        [6.4, 3.2, 4.5, 1.5],
        [6.9, 3.1, 4.9, 1.5],
        [5.5, 2.3, 4. , 1.3],
        [6.5, 2.8, 4.6, 1.5],
        [5.7, 2.8, 4.5, 1.3],
        [6.3, 3.3, 4.7, 1.6],
        [4.9, 2.4, 3.3, 1. ],
        [6.6, 2.9, 4.6, 1.3],
        [5.2, 2.7, 3.9, 1.4],
        [5. , 2. , 3.5, 1. ],
        [5.9, 3. , 4.2, 1.5],
        [6. , 2.2, 4. , 1. ],
        [6.1, 2.9, 4.7, 1.4],
        [5.6, 2.9, 3.6, 1.3],
        [6.7, 3.1, 4.4, 1.4],
        [5.6, 3. , 4.5, 1.5],
        [5.8, 2.7, 4.1, 1. ],
        [6.2, 2.2, 4.5, 1.5],
        [5.6, 2.5, 3.9, 1.1],
        [5.9, 3.2, 4.8, 1.8],
        [6.1, 2.8, 4. , 1.3],
        [6.3, 2.5, 4.9, 1.5],
        [6.1, 2.8, 4.7, 1.2],
        [6.4, 2.9, 4.3, 1.3],
        [6.6, 3. , 4.4, 1.4],
        [6.8, 2.8, 4.8, 1.4],
        [6.7, 3. , 5. , 1.7],
        [6. , 2.9, 4.5, 1.5],
        [5.7, 2.6, 3.5, 1. ],
        [5.5, 2.4, 3.8, 1.1],
        [5.5, 2.4, 3.7, 1. ],
        [5.8, 2.7, 3.9, 1.2],
        [6. , 2.7, 5.1, 1.6],
        [5.4, 3. , 4.5, 1.5],
        [6. , 3.4, 4.5, 1.6],
        [6.7, 3.1, 4.7, 1.5],
        [6.3, 2.3, 4.4, 1.3],
        [5.6, 3. , 4.1, 1.3],
        [5.5, 2.5, 4. , 1.3],
        [5.5, 2.6, 4.4, 1.2],
        [6.1, 3. , 4.6, 1.4],
        [5.8, 2.6, 4. , 1.2],
        [5. , 2.3, 3.3, 1. ],
        [5.6, 2.7, 4.2, 1.3],
        [5.7, 3. , 4.2, 1.2],
        [5.7, 2.9, 4.2, 1.3],
        [6.2, 2.9, 4.3, 1.3],
        [5.1, 2.5, 3. , 1.1],
        [5.7, 2.8, 4.1, 1.3],
        [6.3, 3.3, 6. , 2.5],
        [5.8, 2.7, 5.1, 1.9],
        [7.1, 3. , 5.9, 2.1],
        [6.3, 2.9, 5.6, 1.8],
        [6.5, 3. , 5.8, 2.2],
        [7.6, 3. , 6.6, 2.1],
        [4.9, 2.5, 4.5, 1.7],
        [7.3, 2.9, 6.3, 1.8],
        [6.7, 2.5, 5.8, 1.8],
        [7.2, 3.6, 6.1, 2.5],
        [6.5, 3.2, 5.1, 2. ],
        [6.4, 2.7, 5.3, 1.9],
        [6.8, 3. , 5.5, 2.1],
        [5.7, 2.5, 5. , 2. ],
        [5.8, 2.8, 5.1, 2.4],
        [6.4, 3.2, 5.3, 2.3],
        [6.5, 3. , 5.5, 1.8],
        [7.7, 3.8, 6.7, 2.2],
        [7.7, 2.6, 6.9, 2.3],
        [6. , 2.2, 5. , 1.5],
        [6.9, 3.2, 5.7, 2.3],
        [5.6, 2.8, 4.9, 2. ],
        [7.7, 2.8, 6.7, 2. ],
        [6.3, 2.7, 4.9, 1.8],
        [6.7, 3.3, 5.7, 2.1],
        [7.2, 3.2, 6. , 1.8],
        [6.2, 2.8, 4.8, 1.8],
        [6.1, 3. , 4.9, 1.8],
        [6.4, 2.8, 5.6, 2.1],
        [7.2, 3. , 5.8, 1.6],
        [7.4, 2.8, 6.1, 1.9],
        [7.9, 3.8, 6.4, 2. ],
        [6.4, 2.8, 5.6, 2.2],
        [6.3, 2.8, 5.1, 1.5],
        [6.1, 2.6, 5.6, 1.4],
        [7.7, 3. , 6.1, 2.3],
        [6.3, 3.4, 5.6, 2.4],
        [6.4, 3.1, 5.5, 1.8],
        [6. , 3. , 4.8, 1.8],
        [6.9, 3.1, 5.4, 2.1],
        [6.7, 3.1, 5.6, 2.4],
        [6.9, 3.1, 5.1, 2.3],
        [5.8, 2.7, 5.1, 1.9],
        [6.8, 3.2, 5.9, 2.3],
        [6.7, 3.3, 5.7, 2.5],
        [6.7, 3. , 5.2, 2.3],
        [6.3, 2.5, 5. , 1.9],
        [6.5, 3. , 5.2, 2. ],
        [6.2, 3.4, 5.4, 2.3],
        [5.9, 3. , 5.1, 1.8]]),
 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]),
 'frame': None,
 'target_names': array(['setosa', 'versicolor', 'virginica'], dtype='<U10'),
 'DESCR': '.. _iris_dataset:\n\nIris plants dataset\n--------------------\n\n**Data Set Characteristics:**\n\n    :Number of Instances: 150 (50 in each of three classes)\n    :Number of Attributes: 4 numeric, predictive attributes and the class\n    :Attribute Information:\n        - sepal length in cm\n        - sepal width in cm\n        - petal length in cm\n        - petal width in cm\n        - class:\n                - Iris-Setosa\n                - Iris-Versicolour\n                - Iris-Virginica\n                \n    :Summary Statistics:\n\n    ============== ==== ==== ======= ===== ====================\n                    Min  Max   Mean    SD   Class Correlation\n    ============== ==== ==== ======= ===== ====================\n    sepal length:   4.3  7.9   5.84   0.83    0.7826\n    sepal width:    2.0  4.4   3.05   0.43   -0.4194\n    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)\n    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)\n    ============== ==== ==== ======= ===== ====================\n\n    :Missing Attribute Values: None\n    :Class Distribution: 33.3% for each of 3 classes.\n    :Creator: R.A. Fisher\n    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)\n    :Date: July, 1988\n\nThe famous Iris database, first used by Sir R.A. Fisher. The dataset is taken\nfrom Fisher\'s paper. Note that it\'s the same as in R, but not as in the UCI\nMachine Learning Repository, which has two wrong data points.\n\nThis is perhaps the best known database to be found in the\npattern recognition literature.  Fisher\'s paper is a classic in the field and\nis referenced frequently to this day.  (See Duda & Hart, for example.)  The\ndata set contains 3 classes of 50 instances each, where each class refers to a\ntype of iris plant.  One class is linearly separable from the other 2; the\nlatter are NOT linearly separable from each other.\n\n.. topic:: References\n\n   - Fisher, R.A. "The use of multiple measurements in taxonomic problems"\n     Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions to\n     Mathematical Statistics" (John Wiley, NY, 1950).\n   - Duda, R.O., & Hart, P.E. (1973) Pattern Classification and Scene Analysis.\n     (Q327.D83) John Wiley & Sons.  ISBN 0-471-22361-1.  See page 218.\n   - Dasarathy, B.V. (1980) "Nosing Around the Neighborhood: A New System\n     Structure and Classification Rule for Recognition in Partially Exposed\n     Environments".  IEEE Transactions on Pattern Analysis and Machine\n     Intelligence, Vol. PAMI-2, No. 1, 67-71.\n   - Gates, G.W. (1972) "The Reduced Nearest Neighbor Rule".  IEEE Transactions\n     on Information Theory, May 1972, 431-433.\n   - See also: 1988 MLC Proceedings, 54-64.  Cheeseman et al"s AUTOCLASS II\n     conceptual clustering system finds 3 classes in the data.\n   - Many, many more ...',
 'feature_names': ['sepal length (cm)',
  'sepal width (cm)',
  'petal length (cm)',
  'petal width (cm)'],
 'filename': 'iris.csv',
 'data_module': 'sklearn.datasets.data'}
In [3]:
# Extract features and target variable
X = iris.data
y = iris.target

# Redefine the target variable into two classes: 'virginica' and 'non-virginica'
y_binary = np.where(y == 2, 'virginica', 'non-virginica')

# Create a DataFrame
iris_df = pd.DataFrame(data=X, columns=iris.feature_names)
iris_df['target'] = y_binary
In [4]:
#creating the separate dataframe for each target variable
virginica_data = iris_df[iris_df['target'] == 'virginica']
non_virginica_data = iris_df[iris_df['target'] == 'non-virginica']

# statistics for each variable
virginica_stats = virginica_data.describe()
non_virginica_stats = non_virginica_data.describe()

# Display the tables
print("Descriptive Statistics for Virginica Class:")
print(virginica_stats)

print("\nDescriptive Statistics for Non-Virginica Class:")
print(non_virginica_stats)
Descriptive Statistics for Virginica Class:
       sepal length (cm)  sepal width (cm)  petal length (cm)  \
count           50.00000         50.000000          50.000000   
mean             6.58800          2.974000           5.552000   
std              0.63588          0.322497           0.551895   
min              4.90000          2.200000           4.500000   
25%              6.22500          2.800000           5.100000   
50%              6.50000          3.000000           5.550000   
75%              6.90000          3.175000           5.875000   
max              7.90000          3.800000           6.900000   

       petal width (cm)  
count          50.00000  
mean            2.02600  
std             0.27465  
min             1.40000  
25%             1.80000  
50%             2.00000  
75%             2.30000  
max             2.50000  

Descriptive Statistics for Non-Virginica Class:
       sepal length (cm)  sepal width (cm)  petal length (cm)  \
count         100.000000        100.000000         100.000000   
mean            5.471000          3.099000           2.861000   
std             0.641698          0.478739           1.449549   
min             4.300000          2.000000           1.000000   
25%             5.000000          2.800000           1.500000   
50%             5.400000          3.050000           2.450000   
75%             5.900000          3.400000           4.325000   
max             7.000000          4.400000           5.100000   

       petal width (cm)  
count        100.000000  
mean           0.786000  
std            0.565153  
min            0.100000  
25%            0.200000  
50%            0.800000  
75%            1.300000  
max            1.800000  
In [5]:
# Plot histograms for each feature
for feature in iris.feature_names:
    plt.figure(figsize=(8, 5))
    sns.histplot(data=iris_df, x=feature, hue='target', kde=True)
    plt.title(f'Histogram for {feature} in Virginica and Non-Virginica Classes')
    plt.show()

In each graph, the histograms allow you to observe the spread and concentration of data points for each feature within the "Virginica" and "Non-Virginica" classes. For Sepal Length, non- virginica is right skewed whereas virginica is left skewed. For sepal width, non-virginica is approximately normalized in form, whereas the virginica show sign of right skewness. For Petal length and petal width does not provide a visual answer, which means there are invidence of NaN or null values.

In [6]:
# Correlation matrix
correlation_matrix = iris_df.corr()
print(correlation_matrix)
                   sepal length (cm)  sepal width (cm)  petal length (cm)  \
sepal length (cm)           1.000000         -0.117570           0.871754   
sepal width (cm)           -0.117570          1.000000          -0.428440   
petal length (cm)           0.871754         -0.428440           1.000000   
petal width (cm)            0.817941         -0.366126           0.962865   

                   petal width (cm)  
sepal length (cm)          0.817941  
sepal width (cm)          -0.366126  
petal length (cm)          0.962865  
petal width (cm)           1.000000  
C:\Users\techv\AppData\Local\Temp\ipykernel_21548\2453095933.py:2: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  correlation_matrix = iris_df.corr()
In [7]:
# Plot the correlation matrix as a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title('Correlation Matrix of Iris Features')
plt.show()

The above graph provides the relationship with respect to features. For instances:

  • Petal length and petal width provides a strong positive relation.
  • sepal length and sepal width provides a negative relation.
In [8]:
# Violin plots for all features
features = iris.feature_names

for feature in features:
    plt.figure(figsize=(12, 6))
    sns.violinplot(x="target", y=feature, data=iris_df, inner="quartile", palette={"virginica": "purple", "non-virginica": "orange"})
    plt.title(f"Violin Plot of {feature} by Class")
    plt.show()

Violin plots are used to visualize the distribution of a numeric variable across different categories. The violin shape represents the probability density of the data at different values. Wider sections indicate a higher probability density, and narrower sections indicate lower density.

For More Information: Violin Plot

In [9]:
# Plot box plots for each feature with respect to the target variable
for feature in iris.feature_names:
    plt.figure(figsize=(12, 6))
    sns.boxplot(x='target', y=feature, data=iris_df, palette={"virginica": "purple", "non-virginica": "orange"})
    plt.title(f'Box Plot of {feature} by Class')
    plt.show()

The above box plots provide a visual representation of the distribution of each feature in the Iris dataset with respect to the target variable, which has two classes: 'virginica' and 'non-virginica'. Points outside the whiskers are considered outliers. It provides insights, into the central tendency, spread, and potential outliers for each feature, helping to understand the distribution of the data

For more Information: Box Plot

In [10]:
# Pair plot
sns.set(style="ticks")
sns.pairplot(iris_df, hue="target", palette={"virginica": "purple", "non-virginica": "orange"})
plt.suptitle("Pair Plot of Iris Features by Class", y=1.02)
plt.show()
C:\Users\techv\anaconda3\Lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight
  self._figure.tight_layout(*args, **kwargs)

The pair plot provides scatterplots for each pair of features, colored by class ('virginica' or 'non-virginica'). It helps us visualize relationships between different features and observe potential patterns or separations between classes.

For more information: Pair Plot

In [11]:
# Split the data into train, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y_binary, test_size=0.25, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.6, random_state=42)
In [12]:
# Function to train logistics regression models
def train(features):
    # Select the specified features
    X_train_subset = X_train[:, features]
    X_val_subset = X_val[:, features]
    X_test_subset = X_test[:, features]

    # Train logistic regression model
    model = LogisticRegression(random_state=42)
    model.fit(X_train_subset, y_train)

    # Predictions on the validation set
    y_val_pred = model.predict(X_val_subset)

    # Calculate accuracy on the validation set
    accuracy = accuracy_score(y_val, y_val_pred)

    return model, accuracy
In [13]:
# List of feature combinations to try
feature_combinations = [
    [0],  # One by One feature
    [1],  
    [2],  
    [3],  
    [0, 1],  # Two features
    [0, 2],  
    [1, 2],  
    [0, 1, 2],  # Three features
]
In [14]:
# Train for each feature combination
for features in feature_combinations:
    model, accuracy = train(features)
    print(f"Features: {features}, Accuracy: {accuracy:.2f}")
Features: [0], Accuracy: 0.87
Features: [1], Accuracy: 0.60
Features: [2], Accuracy: 1.00
Features: [3], Accuracy: 1.00
Features: [0, 1], Accuracy: 0.93
Features: [0, 2], Accuracy: 1.00
Features: [1, 2], Accuracy: 1.00
Features: [0, 1, 2], Accuracy: 1.00
In [15]:
# Train for each feature combination
for features in feature_combinations:
    model, accuracy = train(features)
    
# Predictions on the validation set
    y_val_pred_prob = model.predict_proba(X_val[:, features])[:, 1]
    y_val_pred = model.predict(X_val[:, features]) 
    
# Create a DataFrame for each model
    result_df = pd.DataFrame({
        'Instance': range(1, len(X_val) + 1),
        'Probability_Virginica': y_val_pred_prob,
        'Predicted': y_val_pred,
        'Ground Truth': y_val
    })
    
# Display the table
    print(f"Table for Features: {features}, Accuracy: {accuracy:.2f}")
    print(result_df)
    
    
#calculate the precision, recall and f1
    precision = precision_score(y_val, y_val_pred, pos_label='virginica', average='binary')
    recall = recall_score(y_val, y_val_pred, pos_label='virginica', average='binary')
    f1_score_value = f1_score(y_val, y_val_pred, pos_label='virginica', average='binary')

# Summarize the data
    precision_classes = precision_score(y_val, y_val_pred, average=None)
    recall_classes = recall_score(y_val, y_val_pred, average=None)
    f1_classes = f1_score(y_val, y_val_pred, average=None)
    
    print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1-Score: {f1_score_value:.2f}\n")
    
   
Table for Features: [0], Accuracy: 0.87
    Instance  Probability_Virginica      Predicted   Ground Truth
0          1               0.398271  non-virginica      virginica
1          2               0.943719      virginica      virginica
2          3               0.045704  non-virginica  non-virginica
3          4               0.497836  non-virginica      virginica
4          5               0.350992  non-virginica  non-virginica
5          6               0.020901  non-virginica  non-virginica
6          7               0.548189      virginica      virginica
7          8               0.227811  non-virginica  non-virginica
8          9               0.597573      virginica      virginica
9         10               0.066934  non-virginica  non-virginica
10        11               0.227811  non-virginica  non-virginica
11        12               0.037659  non-virginica  non-virginica
12        13               0.769129      virginica      virginica
13        14               0.080708  non-virginica  non-virginica
14        15               0.045704  non-virginica  non-virginica
Precision: 1.00, Recall: 0.67, F1-Score: 0.80

Table for Features: [1], Accuracy: 0.60
    Instance  Probability_Virginica      Predicted   Ground Truth
0          1               0.341824  non-virginica      virginica
1          2               0.379518  non-virginica      virginica
2          3               0.332684  non-virginica  non-virginica
3          4               0.389195  non-virginica      virginica
4          5               0.351083  non-virginica  non-virginica
5          6               0.323668  non-virginica  non-virginica
6          7               0.360456  non-virginica      virginica
7          8               0.226581  non-virginica  non-virginica
8          9               0.323668  non-virginica      virginica
9         10               0.306027  non-virginica  non-virginica
10        11               0.360456  non-virginica  non-virginica
11        12               0.323668  non-virginica  non-virginica
12        13               0.332684  non-virginica      virginica
13        14               0.272427  non-virginica  non-virginica
14        15               0.341824  non-virginica  non-virginica
Precision: 0.00, Recall: 0.00, F1-Score: 0.00

Table for Features: [2], Accuracy: 1.00
    Instance  Probability_Virginica      Predicted   Ground Truth
0          1               0.522854      virginica      virginica
1          2               0.998485      virginica      virginica
2          3               0.000028  non-virginica  non-virginica
3          4               0.601438      virginica      virginica
4          5               0.233537  non-virginica  non-virginica
5          6               0.000011  non-virginica  non-virginica
6          7               0.911441      virginica      virginica
7          8               0.000021  non-virginica  non-virginica
8          9               0.675122      virginica      virginica
9         10               0.000028  non-virginica  non-virginica
10        11               0.233537  non-virginica  non-virginica
11        12               0.000028  non-virginica  non-virginica
12        13               0.675122      virginica      virginica
13        14               0.000021  non-virginica  non-virginica
14        15               0.000015  non-virginica  non-virginica
Precision: 1.00, Recall: 1.00, F1-Score: 1.00

Table for Features: [3], Accuracy: 1.00
    Instance  Probability_Virginica      Predicted   Ground Truth
0          1               0.639351      virginica      virginica
1          2               0.923490      virginica      virginica
2          3               0.003812  non-virginica  non-virginica
3          4               0.722358      virginica      virginica
4          5               0.359309  non-virginica  non-virginica
5          6               0.003812  non-virginica  non-virginica
6          7               0.848574      virginica      virginica
7          8               0.008176  non-virginica  non-virginica
8          9               0.792461      virginica      virginica
9         10               0.008176  non-virginica  non-virginica
10        11               0.206584  non-virginica  non-virginica
11        12               0.003812  non-virginica  non-virginica
12        13               0.923490      virginica      virginica
13        14               0.005585  non-virginica  non-virginica
14        15               0.005585  non-virginica  non-virginica
Precision: 1.00, Recall: 1.00, F1-Score: 1.00

Table for Features: [0, 1], Accuracy: 0.93
    Instance  Probability_Virginica      Predicted   Ground Truth
0          1               0.395505  non-virginica      virginica
1          2               0.951534      virginica      virginica
2          3               0.043378  non-virginica  non-virginica
3          4               0.547310      virginica      virginica
4          5               0.357930  non-virginica  non-virginica
5          6               0.019006  non-virginica  non-virginica
6          7               0.566111      virginica      virginica
7          8               0.139412  non-virginica  non-virginica
8          9               0.574497      virginica      virginica
9         10               0.056509  non-virginica  non-virginica
10        11               0.240730  non-virginica  non-virginica
11        12               0.034306  non-virginica  non-virginica
12        13               0.759617      virginica      virginica
13        14               0.058362  non-virginica  non-virginica
14        15               0.045153  non-virginica  non-virginica
Precision: 1.00, Recall: 0.83, F1-Score: 0.91

Table for Features: [0, 2], Accuracy: 1.00
    Instance  Probability_Virginica      Predicted   Ground Truth
0          1               0.531650      virginica      virginica
1          2               0.997731      virginica      virginica
2          3               0.000031  non-virginica  non-virginica
3          4               0.589951      virginica      virginica
4          5               0.235600  non-virginica  non-virginica
5          6               0.000014  non-virginica  non-virginica
6          7               0.912517      virginica      virginica
7          8               0.000014  non-virginica  non-virginica
8          9               0.645830      virginica      virginica
9         10               0.000028  non-virginica  non-virginica
10        11               0.264153  non-virginica  non-virginica
11        12               0.000032  non-virginica  non-virginica
12        13               0.598082      virginica      virginica
13        14               0.000019  non-virginica  non-virginica
14        15               0.000016  non-virginica  non-virginica
Precision: 1.00, Recall: 1.00, F1-Score: 1.00

Table for Features: [1, 2], Accuracy: 1.00
    Instance  Probability_Virginica      Predicted   Ground Truth
0          1               0.510675      virginica      virginica
1          2               0.998768      virginica      virginica
2          3               0.000022  non-virginica  non-virginica
3          4               0.637675      virginica      virginica
4          5               0.228453  non-virginica  non-virginica
5          6               0.000008  non-virginica  non-virginica
6          7               0.916493      virginica      virginica
7          8               0.000010  non-virginica  non-virginica
8          9               0.648686      virginica      virginica
9         10               0.000020  non-virginica  non-virginica
10        11               0.235498  non-virginica  non-virginica
11        12               0.000021  non-virginica  non-virginica
12        13               0.657645      virginica      virginica
13        14               0.000012  non-virginica  non-virginica
14        15               0.000012  non-virginica  non-virginica
Precision: 1.00, Recall: 1.00, F1-Score: 1.00

Table for Features: [0, 1, 2], Accuracy: 1.00
    Instance  Probability_Virginica      Predicted   Ground Truth
0          1               0.522413      virginica      virginica
1          2               0.998105      virginica      virginica
2          3               0.000026  non-virginica  non-virginica
3          4               0.616890      virginica      virginica
4          5               0.231774  non-virginica  non-virginica
5          6               0.000011  non-virginica  non-virginica
6          7               0.916058      virginica      virginica
7          8               0.000008  non-virginica  non-virginica
8          9               0.629051      virginica      virginica
9         10               0.000021  non-virginica  non-virginica
10        11               0.263068  non-virginica  non-virginica
11        12               0.000026  non-virginica  non-virginica
12        13               0.591204      virginica      virginica
13        14               0.000013  non-virginica  non-virginica
14        15               0.000013  non-virginica  non-virginica
Precision: 1.00, Recall: 1.00, F1-Score: 1.00

C:\Users\techv\anaconda3\Lib\site-packages\sklearn\metrics\_classification.py:1469: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
C:\Users\techv\anaconda3\Lib\site-packages\sklearn\metrics\_classification.py:1469: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))

The result are each feature combinations, while providing the accuracy, precision, recall and f1-score for the first 15 insatnces.

In [16]:
# Create subplots for each feature combination
fig = go.Figure()

for features in feature_combinations:
    model, accuracy = train(features)

    if len(features) == 1:
        x1_range = np.linspace(X_val[:, features[0]].min(), X_val[:, features[0]].max(), 100)
        x1, = np.meshgrid(x1_range)
        decision_boundary = -model.intercept_ / model.coef_[0] * x1
        fig.add_trace(go.Scatter(x=x1_range, y=decision_boundary.flatten(), mode='lines', name=f'Decision Boundary ({features})'))
    elif len(features) == 2:
        x1_range = np.linspace(X_val[:, features[0]].min(), X_val[:, features[0]].max(), 100)
        x2_range = np.linspace(X_val[:, features[1]].min(), X_val[:, features[1]].max(), 100)
        x1, x2 = np.meshgrid(x1_range, x2_range)

        if len(features) == 2:
            decision_boundary = - (model.intercept_ + model.coef_[0][0] * x1 + model.coef_[0][1] * x2) / model.coef_[0][1]
        else:
            decision_boundary = - (model.intercept_ + model.coef_[0][0] * x1 + model.coef_[0][1] * x2) / model.coef_[0][2]

        fig.add_trace(go.Surface(x=x1, y=x2, z=decision_boundary, opacity=0.8, colorscale='Viridis', name=f'Decision Boundary ({features})'))

# Plot scatter points for each class
for class_label, color in zip(['non-virginica', 'virginica'], ['blue', 'red']):
    class_indices = (y_val == class_label)
    fig.add_trace(go.Scatter3d(
        x=X_val[class_indices, 0],
        y=X_val[class_indices, 1],
        z=X_val[class_indices, 2],
        mode='markers',
        marker=dict(color=color),
        name=class_label
    ))

# Set layout
fig.update_layout(scene=dict(
    xaxis_title=iris.feature_names[0],
    yaxis_title=iris.feature_names[1],
    zaxis_title=iris.feature_names[2],
))

# Show plot
fig.show()

The surfaces in the plot represent the decision boundaries that separate the two classes ('non-virginica' and 'virginica') based on the logistic regression models. Blue points belong to the 'non-virginica' class, and red points belong to the 'virginica' class. In this case, the decision boundaries appear to be fairly well-defined, which suggests that the algorithm is able to accurately classify the Iris flowers based on these two features.

In [17]:
# Train for each feature combination and analyze confusion matrix
for features in feature_combinations:
    model, accuracy = train(features)
    
    # Predictions on the validation set
    y_val_pred = model.predict(X_val[:, features])
    
    # Confusion matrix
    cm = confusion_matrix(y_val, y_val_pred, labels=['non-virginica', 'virginica'])
    
    # Display confusion matrix
    print(f"Confusion Matrix for Features: {features}")
    print(pd.DataFrame(cm, index=['Actual Non-Virginica', 'Actual Virginica'], columns=['Predicted Non-Virginica', 'Predicted Virginica']))
    
    # Analyze failure modes
    false_positives = X_val[(y_val == 'non-virginica') & (y_val_pred == 'virginica')]
    false_negatives = X_val[(y_val == 'virginica') & (y_val_pred == 'non-virginica')]
    
    # Display failure modes
    print(f"False Positives for Features: {features}")
    print(false_positives)
    
    print(f"False Negatives for Features: {features}")
    print(false_negatives)
    print("\n")
Confusion Matrix for Features: [0]
                      Predicted Non-Virginica  Predicted Virginica
Actual Non-Virginica                        9                    0
Actual Virginica                            2                    4
False Positives for Features: [0]
[]
False Negatives for Features: [0]
[[6.1 3.  4.9 1.8]
 [6.3 2.5 5.  1.9]]


Confusion Matrix for Features: [1]
                      Predicted Non-Virginica  Predicted Virginica
Actual Non-Virginica                        9                    0
Actual Virginica                            6                    0
False Positives for Features: [1]
[]
False Negatives for Features: [1]
[[6.1 3.  4.9 1.8]
 [7.7 2.6 6.9 2.3]
 [6.3 2.5 5.  1.9]
 [6.4 2.8 5.6 2.1]
 [6.5 3.2 5.1 2. ]
 [6.9 3.1 5.1 2.3]]


Confusion Matrix for Features: [2]
                      Predicted Non-Virginica  Predicted Virginica
Actual Non-Virginica                        9                    0
Actual Virginica                            0                    6
False Positives for Features: [2]
[]
False Negatives for Features: [2]
[]


Confusion Matrix for Features: [3]
                      Predicted Non-Virginica  Predicted Virginica
Actual Non-Virginica                        9                    0
Actual Virginica                            0                    6
False Positives for Features: [3]
[]
False Negatives for Features: [3]
[]


Confusion Matrix for Features: [0, 1]
                      Predicted Non-Virginica  Predicted Virginica
Actual Non-Virginica                        9                    0
Actual Virginica                            1                    5
False Positives for Features: [0, 1]
[]
False Negatives for Features: [0, 1]
[[6.1 3.  4.9 1.8]]


Confusion Matrix for Features: [0, 2]
                      Predicted Non-Virginica  Predicted Virginica
Actual Non-Virginica                        9                    0
Actual Virginica                            0                    6
False Positives for Features: [0, 2]
[]
False Negatives for Features: [0, 2]
[]


Confusion Matrix for Features: [1, 2]
                      Predicted Non-Virginica  Predicted Virginica
Actual Non-Virginica                        9                    0
Actual Virginica                            0                    6
False Positives for Features: [1, 2]
[]
False Negatives for Features: [1, 2]
[]


Confusion Matrix for Features: [0, 1, 2]
                      Predicted Non-Virginica  Predicted Virginica
Actual Non-Virginica                        9                    0
Actual Virginica                            0                    6
False Positives for Features: [0, 1, 2]
[]
False Negatives for Features: [0, 1, 2]
[]


  • Features 2 and 3 individually, as well as combinations including them ([2], [3], [0, 2], [1, 2], [0, 1, 2]), lead to perfect classification of the validation set (no false negatives or false positives). This suggests that features 2 and 3 are highly informative for distinguishing between "non-virginica" and "virginica" classes.
  • Feature 1, either alone or with feature 0, results in more false negatives, indicating it might be less reliable for identifying "virginica" samples.
  • Feature 0 doesn't seem to provide significant additional information when combined with features 2 and 3.
In [18]:
# find the best model based on accuracy
best_model = None
best_accuracy = 0.0
best_features = None #had to reset because features variable retains the last value of the loop

for features in feature_combinations:
    model, accuracy = train(features)
    
    # best model 
    if accuracy > best_accuracy:
        best_model = model
        best_accuracy = accuracy
        best_features = features

# Display the best model and its accuracy
print(f"Best Model Features: {best_features}, Best Accuracy: {best_accuracy:.2f}")

# Evaluate the best model on the test set
y_test_pred = best_model.predict(X_test[:, best_features])

# Calculate accuracy on the test set
test_accuracy = accuracy_score(y_test, y_test_pred)

# Display the test set results
print("\nTest Set Results:")
print(f"Test Set Accuracy: {test_accuracy:.2f}")

# Confusion matrix for the test set
cm_test = confusion_matrix(y_test, y_test_pred, labels=['non-virginica', 'virginica'])

# Display confusion matrix for the test set
print("\nConfusion Matrix for Test Set:")
print(pd.DataFrame(cm_test, index=['Actual Non-Virginica', 'Actual Virginica'], columns=['Predicted Non-Virginica', 'Predicted Virginica']))
Best Model Features: [2], Best Accuracy: 1.00

Test Set Results:
Test Set Accuracy: 1.00

Confusion Matrix for Test Set:
                      Predicted Non-Virginica  Predicted Virginica
Actual Non-Virginica                       17                    0
Actual Virginica                            0                    6

The confusion matrix confirms the model's performance:

  • All 17 actual "non-virginica" samples were correctly predicted as "non-virginica".
  • All 6 actual "virginica" samples were correctly predicted as "virginica"

Conclusion:¶

  • Feature 2 provided the best results of all the feature combinations.
  • It achieved perfect accuracy (1.00) on both validation and test sets, indicating its strong ability to distinguish between "non-virginica" and "virginica" classes.
  • The test set confusion matrix showed no false positives or false negatives, meaning all samples were correctly classified.
In [ ]: